Part 1

Tidy up the data ready for regression

Load libraries and data:

library(car)
library(tidyverse)
library(modelr)
library(GGally)
library(relaimpo)
library(lm.beta)
library(fastDummies)

# Read the raw house-sales data into a tibble.
houses <- read_csv(file = "data/kc_house_data.csv")

Check for any missing values:

# Count the missing values in every column. across() replaces the deprecated
# summarise_all(funs(...)) form, so no lifecycle warning is emitted.
houses %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

Remove columns that are not useful:

# Drop the columns that are not used as predictors. dplyr::select is
# namespace-qualified so a select() from another attached package cannot
# shadow it.
houses_clean <- dplyr::select(
  houses,
  -date, -id, -sqft_living15, -sqft_lot15, -zipcode
)

colnames(houses_clean)
##  [1] "price"         "bedrooms"      "bathrooms"     "sqft_living"  
##  [5] "sqft_lot"      "floors"        "waterfront"    "view"         
##  [9] "condition"     "grade"         "sqft_above"    "sqft_basement"
## [13] "yr_built"      "yr_renovated"  "lat"           "long"

Convert waterfront into a logical variable:

# TRUE where waterfront is 1, FALSE otherwise (NA stays NA). The comparison
# already yields a logical vector, so no ifelse() wrapper is needed.
houses_clean <- houses_clean %>%
  mutate(waterfront = waterfront == 1)

houses_clean

Convert ‘yr_renovated’ into a ‘renovated’ logical variable:

# renovated is FALSE where yr_renovated is 0 and TRUE for any other value
# (NA stays NA); the original year column is then dropped.
houses_clean <- houses_clean %>%
  mutate(renovated = yr_renovated != 0) %>%
  dplyr::select(-yr_renovated)

houses_clean

‘Condition’ and ‘Grade’ are both categorical variables, even though they are stored as numbers. To model them accurately, we create ‘dummy variables’ for each level of condition and grade, each of which takes the value 1 or 0 only:

# Inspect the range of both codes before expanding them into dummies.
houses_clean %>%
  summarise(
    min_grade = min(grade), max_grade = max(grade),
    min_condition = min(condition), max_condition = max(condition)
  )

# Create one indicator column per level of grade and condition. The first
# level of each is dropped so it acts as the baseline, and the original
# columns are removed once the indicators exist.
houses_dummy <- houses_clean %>%
  dummy_cols(
    select_columns = c("grade", "condition"),
    remove_first_dummy = TRUE
  ) %>%
  dplyr::select(-c(condition, grade))

houses_dummy

Part 2

Check and remove aliased variables

# Fit price against every other column and report linearly dependent terms.
lm(price ~ ., data = houses_dummy) %>%
  alias()
## Model :
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
##     waterfront + view + sqft_above + sqft_basement + yr_built + 
##     lat + long + renovated + grade_3 + grade_4 + grade_5 + grade_6 + 
##     grade_7 + grade_8 + grade_9 + grade_10 + grade_11 + grade_12 + 
##     grade_13 + condition_2 + condition_3 + condition_4 + condition_5
## 
## Complete :
##               (Intercept) bedrooms bathrooms sqft_living sqft_lot floors
## sqft_basement  0           0        0         1           0        0    
##               waterfrontTRUE view sqft_above yr_built lat long renovatedTRUE
## sqft_basement  0              0   -1          0        0   0    0           
##               grade_3 grade_4 grade_5 grade_6 grade_7 grade_8 grade_9 grade_10
## sqft_basement  0       0       0       0       0       0       0       0      
##               grade_11 grade_12 grade_13 condition_2 condition_3 condition_4
## sqft_basement  0        0        0        0           0           0         
##               condition_5
## sqft_basement  0
# sqft_basement is aliased: the alias table shows it equals
# sqft_living - sqft_above, so it is dropped before modelling.
houses_trim <- dplyr::select(houses_dummy, -sqft_basement)

print(houses_trim)

# Re-run the alias check on the trimmed data.
lm(price ~ ., data = houses_trim) %>%
  alias()
## Model :
## price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + 
##     waterfront + view + sqft_above + yr_built + lat + long + 
##     renovated + grade_3 + grade_4 + grade_5 + grade_6 + grade_7 + 
##     grade_8 + grade_9 + grade_10 + grade_11 + grade_12 + grade_13 + 
##     condition_2 + condition_3 + condition_4 + condition_5
# no aliases found

Part 3

Regression Models

First predictor

# Split the columns by type so each ggpairs() grid is built from one kind of
# variable. select(where(...)) replaces the superseded select_if().
houses_trim_numeric <- houses_trim %>%
  dplyr::select(where(is.numeric))

# Non-numeric columns, with price appended as the last column so the
# response appears in the grid alongside them.
houses_trim_nonnumeric <- houses_trim %>%
  dplyr::select(!where(is.numeric), price)

ggpairs(houses_trim_numeric)

ggpairs(houses_trim_nonnumeric)

Model 1

Price ~ Bedrooms

# Simple linear regression of price on the number of bedrooms.
model1 <- lm(formula = price ~ bedrooms, data = houses_trim)

print(model1)
## 
## Call:
## lm(formula = price ~ bedrooms, data = houses_trim)
## 
## Coefficients:
## (Intercept)     bedrooms  
##      129802       121716

This tells us that the predicted ‘price’ will increase by 121,716 for every extra bedroom.

Summary

# Coefficient table, residual quantiles and R-squared for model1.
print(summary(model1))
## 
## Call:
## lm(formula = price ~ bedrooms, data = houses_trim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3506435  -203235   -66667   105049  6839901 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   129802       8932   14.53   <2e-16 ***
## bedrooms      121716       2554   47.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 349200 on 21611 degrees of freedom
## Multiple R-squared:  0.09508,    Adjusted R-squared:  0.09504 
## F-statistic:  2271 on 1 and 21611 DF,  p-value: < 2.2e-16

This tells us that 9.5% of the variance in price is explained by the number of bedrooms.

Plot

# Lay the lm diagnostic plots for model1 out in a 2 x 2 grid.
par(mfrow = c(2L, 2L))

plot(x = model1)

Model 2

Price ~ sqft_living

# Simple linear regression of price on living area.
model2 <- lm(formula = price ~ sqft_living, data = houses_trim)

print(model2)
## 
## Call:
## lm(formula = price ~ sqft_living, data = houses_trim)
## 
## Coefficients:
## (Intercept)  sqft_living  
##    -43580.7        280.6

This tells us that the predicted ‘price’ will increase by 280.6 for every additional square foot of living space.

Summary

# Coefficient table, residual quantiles and R-squared for model2.
print(summary(model2))
## 
## Call:
## lm(formula = price ~ sqft_living, data = houses_trim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1476062  -147486   -24043   106182  4362067 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43580.743   4402.690  -9.899   <2e-16 ***
## sqft_living    280.624      1.936 144.920   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 261500 on 21611 degrees of freedom
## Multiple R-squared:  0.4929, Adjusted R-squared:  0.4928 
## F-statistic: 2.1e+04 on 1 and 21611 DF,  p-value: < 2.2e-16

This tells us that 49% of the variance in price is explained by the square footage of living space.

Plot

# Lay the lm diagnostic plots for model2 out in a 2 x 2 grid.
par(mfrow = c(2L, 2L))

plot(x = model2)

Model 3

Price ~ Condition

# Regression of price on the condition dummies. condition_1 has no column,
# so it is the baseline absorbed into the intercept.
model3 <- lm(
  formula = price ~ condition_2 + condition_3 + condition_4 + condition_5,
  data = houses_trim
)

print(model3)
## 
## Call:
## lm(formula = price ~ condition_2 + condition_3 + condition_4 + 
##     condition_5, data = houses_trim)
## 
## Coefficients:
## (Intercept)  condition_2  condition_3  condition_4  condition_5  
##      334432        -7145       207581       186769       277986

This tells us that ‘condition 5’ increases the predicted price the most relative to the baseline (condition 1).

Summary

# Coefficient table, residual quantiles and R-squared for model3.
print(summary(model3))
## 
## Call:
## lm(formula = price ~ condition_2 + condition_3 + condition_4 + 
##     condition_5, data = houses_trim)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -502418 -217013  -87013  102800 7178800 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   334432      66803   5.006 5.59e-07 ***
## condition_2    -7144      72395  -0.099  0.92139    
## condition_3   207581      66875   3.104  0.00191 ** 
## condition_4   186769      66979   2.788  0.00530 ** 
## condition_5   277986      67390   4.125 3.72e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 365900 on 21608 degrees of freedom
## Multiple R-squared:  0.006878,   Adjusted R-squared:  0.006694 
## F-statistic: 37.41 on 4 and 21608 DF,  p-value: < 2.2e-16

This tells us that condition accounts for only about 0.7% of the variance in price.

Plot

# Lay the lm diagnostic plots for model3 out in a 2 x 2 grid.
par(mfrow = c(2L, 2L))

plot(x = model3)

Model 4

Price ~ Waterfront

# Simple linear regression of price on the logical waterfront flag.
model4 <- lm(formula = price ~ waterfront, data = houses_trim)

print(model4)
## 
## Call:
## lm(formula = price ~ waterfront, data = houses_trim)
## 
## Coefficients:
##    (Intercept)  waterfrontTRUE  
##         531564         1130312
# Coefficient table, residual quantiles and R-squared for model4.
print(summary(model4))
## 
## Call:
## lm(formula = price ~ waterfront, data = houses_trim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1376876  -211564   -81564   108436  7168436 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      531564       2416  220.00   <2e-16 ***
## waterfrontTRUE  1130312      27822   40.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 353900 on 21611 degrees of freedom
## Multiple R-squared:  0.07095,    Adjusted R-squared:  0.07091 
## F-statistic:  1650 on 1 and 21611 DF,  p-value: < 2.2e-16

Model 5

Price ~ bedrooms + sqft_living

# Multiple regression of price on bedrooms and living area together.
model5 <- lm(formula = price ~ bedrooms + sqft_living, data = houses_trim)

print(summary(model5))
## 
## Call:
## lm(formula = price ~ bedrooms + sqft_living, data = houses_trim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1650867  -143866   -23143   102344  4179850 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  79469.359   6604.764   12.03   <2e-16 ***
## bedrooms    -57066.759   2308.223  -24.72   <2e-16 ***
## sqft_living    313.949      2.337  134.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 257800 on 21610 degrees of freedom
## Multiple R-squared:  0.5068, Adjusted R-squared:  0.5068 
## F-statistic: 1.11e+04 on 2 and 21610 DF,  p-value: < 2.2e-16

This tells us that bedrooms and sqft_living together account for about 51% of the variance in price.